Techniques for Avoiding Overfitting, Pt. 1
Feature Engineering

Behavioral Data Science in R II
Unit 2
Module 6

Overview

  • Cross validation
  • Feature engineering
    • Feature selection
    • Feature extraction
  • Regularization
  • Ensemble methods

The data: cellphone sensors X activity

The data: cellphone sensors X activity

dim(df)
[1] 6471  113

Potential problems with high-dimensional data

  • Including irrelevant predictors -> Overfitting
  • Multicollinearity
    • Less-than-perfect collinearity is fine for prediction, but can be a problem for inference
    • With perfect collinearity, regression models can’t be fit at all
  • More features -> more time & data to train

Feature selection:

Example Data

# Simulate a regression dataset with known structure: y depends only on
# x1-x3; every column added in mutate() is a distractor used to
# demonstrate the feature-selection filtering steps below.
n_obs = 1000
sim_lin_df = 
  tibble(
    x1 = rnorm(n_obs,0,5),
    x2 = rnorm(n_obs,0,10),
    x3 = rnorm(n_obs,0,15),
    y = 4.17 + 8.06*x1 + 8.06*x2 + 8.06*x3 + rnorm(n_obs,0,10)
    ) %>% 
  mutate(
    noise1 = rnorm(n_obs,0,10), # pure noise, unrelated to y
    noise2 = rnorm(n_obs,0,10), # pure noise, unrelated to y
    perfectCorr_x2 = 2.55 + 3.94*x2, # perfectly correlated with x2
    highCorr_x3 = 7.92 + 4.37*x3 + rnorm(n_obs,0,40), # strongly (but not perfectly) correlated with x3
    linComb_x2_x3 = 17 + 3.1*x2 + 4.7*x3, # linComb_x2_x3 is an exact linear combination of x2 and x3
    zeroVar = rep(1.24, n_obs), # zeroVar has zero variance (constant column)
  ) 
[1] "Corr(x3, highCorr_x3):"
[1] 0.8506153

Feature selection:

Filtering methods

# Filtering step 1: screen out near-zero-variance predictors
# (drops the constant zeroVar column).
rec <- sim_lin_df %>% 
  recipe(y ~ .) %>% 
  step_nzv(all_predictors())

Kept

[1] "x1"             "x2"             "x3"             "noise1"        
[5] "noise2"         "perfectCorr_x2" "highCorr_x3"    "linComb_x2_x3" 
[9] "y"             

Removed

[1] "zeroVar"
# Filtering step 2: after the near-zero-variance screen, drop predictors
# that are exact linear combinations of others
# (removes linComb_x2_x3 in addition to zeroVar).
rec <- sim_lin_df %>% 
  recipe(y ~ .) %>% 
  step_nzv(all_predictors()) %>% 
  step_lincomb(all_predictors())

Kept

[1] "x1"             "x2"             "x3"             "noise1"        
[5] "noise2"         "perfectCorr_x2" "highCorr_x3"    "y"             

Removed

[1] "linComb_x2_x3" "zeroVar"      
# Filtering step 3: also remove one member of any predictor pair whose
# absolute correlation exceeds .9 (here x2, which is perfectly
# correlated with perfectCorr_x2).
rec <- sim_lin_df %>% 
  recipe(y ~ .) %>% 
  step_nzv(all_predictors()) %>% 
  step_lincomb(all_predictors()) %>% 
  step_corr(all_predictors(), threshold = .9)

Kept

[1] "x1"             "x3"             "noise1"         "noise2"        
[5] "perfectCorr_x2" "highCorr_x3"    "y"             

Removed

[1] "x2"            "linComb_x2_x3" "zeroVar"      
# Lowering the correlation threshold to .7 additionally removes
# highCorr_x3 (its correlation with x3 is ~.85).
rec <- sim_lin_df %>% 
  recipe(y ~ .) %>% 
  step_nzv(all_predictors()) %>% 
  step_lincomb(all_predictors()) %>% 
  step_corr(all_predictors(), threshold = .7)

Kept

[1] "x1"             "x3"             "noise1"         "noise2"        
[5] "perfectCorr_x2" "y"             

Removed

[1] "x2"            "highCorr_x3"   "linComb_x2_x3" "zeroVar"      

Feature selection:

Feature importance

# Fit an ordinary linear regression through the filtering recipe.
lr_fit <- 
  workflow() %>% 
  add_recipe(rec) %>% 
  add_model(linear_reg()) %>% 
  fit(data = sim_lin_df)

# Coefficient table, least-significant terms first.
lr_fit %>% tidy() %>% arrange(desc(p.value))
# A tibble: 6 × 5
  term            estimate std.error statistic  p.value
  <chr>              <dbl>     <dbl>     <dbl>    <dbl>
1 noise1          0.000322   0.0315     0.0102 0.992   
2 noise2          0.0453     0.0318     1.43   0.154   
3 (Intercept)    -1.14       0.316     -3.59   0.000342
4 x1              8.06       0.0636   127.     0       
5 x3              8.10       0.0214   378.     0       
6 perfectCorr_x2  2.04       0.00797  256.     0       
# Manually drop the least important predictor (noise2) and refit.
rec2 <- rec %>% 
  step_rm(all_of("noise2"))

lr_fit <- 
  workflow() %>% 
  add_recipe(rec2) %>% 
  add_model(linear_reg()) %>% 
  fit(data = sim_lin_df)

lr_fit %>% tidy() %>% arrange(desc(p.value))
# A tibble: 5 × 5
  term           estimate std.error statistic  p.value
  <chr>             <dbl>     <dbl>     <dbl>    <dbl>
1 noise1          0.00117   0.0315     0.0371 0.970   
2 (Intercept)    -1.12      0.316     -3.54   0.000423
3 x1              8.06      0.0636   127.     0       
4 x3              8.09      0.0214   378.     0       
5 perfectCorr_x2  2.04      0.00796  257.     0       
# Drop both noise predictors and refit; only the true signal
# variables remain.
rec2 <- rec %>% 
  step_rm(all_of(c("noise2", "noise1")))

lr_fit <- 
  workflow() %>% 
  add_recipe(rec2) %>% 
  add_model(linear_reg()) %>% 
  fit(data = sim_lin_df)

lr_fit %>% tidy() %>% arrange(desc(p.value))
# A tibble: 4 × 5
  term           estimate std.error statistic  p.value
  <chr>             <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)       -1.12   0.316       -3.54 0.000413
2 x1                 8.06   0.0636     127.   0       
3 x3                 8.09   0.0214     378.   0       
4 perfectCorr_x2     2.04   0.00793    258.   0       

Feature selection:

Feature importance


# Random-forest specification: ranger engine, 500 trees, impurity-based
# variable importance so features can be ranked after fitting.
# FIX: the original set the mode twice (`rand_forest(mode = "regression")`
# AND `set_mode("regression")`); setting it once is sufficient.
rf_spec <- 
  rand_forest(trees = 500) %>% 
  set_mode("regression") %>% 
  set_engine(
    "ranger", 
    importance = "impurity")

# Fit the forest through the same filtering recipe as the linear model.
rf_fit <- 
  workflow() %>% 
  add_recipe(rec) %>%
  add_model(rf_spec) %>%  
  fit(sim_lin_df) 

Feature selection:

Wrapper methods

A.k.a.:

  • Forward selection (adding features one at a time)
  • Backward elimination (removing features one at a time)

Feature Extraction

Dimensionality Reduction

  • Principal Components Analysis (PCA)
  • Independent Components Analysis (ICA)
  • Linear Discriminant Analysis (LDA)
  • t-distributed Stochastic Neighbor Embedding (t-SNE)

PCA

# PCA feature extraction: normalize the predictors (PCA is scale
# sensitive), then project them onto the first 3 principal components.
pca_rec <- 
  sim_lin_df %>% 
  recipe(y ~ .) %>% 
  step_nzv(all_predictors()) %>% 
  step_normalize(all_predictors()) %>% 
  step_pca(all_predictors(), num_comp = 3)

# Estimate the PCA rotation on the data, then apply it.
pca_estimates <- prep(pca_rec, training = sim_lin_df)
pca_data <- bake(pca_estimates, new_data = sim_lin_df)

# Loadings from recipe step 3 (the PCA step).
tidy(pca_estimates, number = 3)
# A tibble: 64 × 4
   terms            value component id       
   <chr>            <dbl> <chr>     <chr>    
 1 x1              0.0217 PC1       pca_EoYnc
 2 x2              0.317  PC1       pca_EoYnc
 3 x3              0.494  PC1       pca_EoYnc
 4 noise1          0.0299 PC1       pca_EoYnc
 5 noise2         -0.0518 PC1       pca_EoYnc
 6 perfectCorr_x2  0.317  PC1       pca_EoYnc
 7 highCorr_x3     0.469  PC1       pca_EoYnc
 8 linComb_x2_x3   0.575  PC1       pca_EoYnc
 9 x1              0.132  PC2       pca_EoYnc
10 x2              0.595  PC2       pca_EoYnc
# ℹ 54 more rows

PCA

# Cumulative percent of variance explained by successive components.
pca_estimates %>% 
  tidy(number = 3, type = "variance") %>% 
  filter(terms == "cumulative percent variance")
# A tibble: 8 × 4
  terms                       value component id       
  <chr>                       <dbl>     <int> <chr>    
1 cumulative percent variance  36.3         1 pca_EoYnc
2 cumulative percent variance  60.7         2 pca_EoYnc
3 cumulative percent variance  73.3         3 pca_EoYnc
4 cumulative percent variance  85.7         4 pca_EoYnc
5 cumulative percent variance  97.6         5 pca_EoYnc
6 cumulative percent variance 100           6 pca_EoYnc
7 cumulative percent variance 100           7 pca_EoYnc
8 cumulative percent variance 100           8 pca_EoYnc

PCA

PCA

Applying these techniques to the Human Activity Recognition Data

dim(df)
[1] 6471  113

Regularization

# A tibble: 22 × 7
   penalty .metric .estimator  mean     n  std_err .config              
     <dbl> <chr>   <chr>      <dbl> <int>    <dbl> <chr>                
 1     0   rmse    standard   9.85      5 0.254    Preprocessor1_Model01
 2     0   rsq     standard   0.996     5 0.000467 Preprocessor1_Model01
 3     0.1 rmse    standard   9.85      5 0.254    Preprocessor1_Model02
 4     0.1 rsq     standard   0.996     5 0.000467 Preprocessor1_Model02
 5     0.2 rmse    standard   9.85      5 0.254    Preprocessor1_Model03
 6     0.2 rsq     standard   0.996     5 0.000467 Preprocessor1_Model03
 7     0.3 rmse    standard   9.85      5 0.254    Preprocessor1_Model04
 8     0.3 rsq     standard   0.996     5 0.000467 Preprocessor1_Model04
 9     0.4 rmse    standard   9.85      5 0.254    Preprocessor1_Model05
10     0.4 rsq     standard   0.996     5 0.000467 Preprocessor1_Model05
# ℹ 12 more rows
# A tibble: 9 × 3
  term           estimate penalty
  <chr>             <dbl>   <dbl>
1 (Intercept)    3.68e+ 0     0.8
2 x1             3.91e+ 1     0.8
3 x2             2.77e+ 1     0.8
4 x3             0            0.8
5 noise1         0            0.8
6 noise2         0            0.8
7 perfectCorr_x2 2.99e-12     0.8
8 highCorr_x3    0            0.8
9 linComb_x2_x3  1.27e+ 2     0.8